EDA and cleaning the data¶

Ratings:¶

In [15]:
df_ratings.head()
Out[15]:
name rating review
0 Jalsa 4.0 A beautiful place to dine inThe interiors take...
1 Jalsa 4.0 I was here for dinner with my family on a week...
2 Jalsa 2.0 Its a restaurant near to Banashankari BDA Me a...
3 Jalsa 4.0 We went here on a weekend and one of us had th...
4 Jalsa 5.0 The best thing about the place is its ambiance...

Locations:¶

In [24]:
df_locations.head()
Out[24]:
Name latitude longitude
0 Bangalore Banashankari 12.915221 77.573598
1 Bangalore Basavanagudi 12.941726 77.575502
2 Bangalore Mysore Road 12.931409 77.506714
3 Bangalore Jayanagar 12.929273 77.582423
4 Bangalore Kumaraswamy Layout 12.906817 77.563525

Zomato:¶

In [42]:
df_zomato.head()
Out[42]:
name online_order book_table rate votes location rest_type cuisines avg_cost_for_2_people reviews_list menu_item listed_type listed_city
0 Jalsa Yes Yes 4.1 775 Banashankari Casual Dining North Indian, Mughlai, Chinese 800.0 [('Rated 4.0', 'RATED\n A beautiful place to ... [] Buffet Banashankari
1 Spice Elephant Yes No 4.1 787 Banashankari Casual Dining Chinese, North Indian, Thai 800.0 [('Rated 4.0', 'RATED\n Had been here for din... [] Buffet Banashankari
2 San Churro Cafe Yes No 3.8 918 Banashankari Cafe, Casual Dining Cafe, Mexican, Italian 800.0 [('Rated 3.0', "RATED\n Ambience is not that ... [] Buffet Banashankari
3 Addhuri Udupi Bhojana No No 3.7 88 Banashankari Quick Bites South Indian, North Indian 300.0 [('Rated 4.0', "RATED\n Great food and proper... [] Buffet Banashankari
4 Grand Village No No 3.8 166 Basavanagudi Casual Dining North Indian, Rajasthani 600.0 [('Rated 4.0', 'RATED\n Very good restaurant ... [] Buffet Banashankari

Data visualization¶

In [43]:
# Number of restaurants in a particular location

fig = plt.figure(figsize=(20,7))
loc = sns.countplot(x="location", data=df_zomato, palette = "Set1")
loc.set_xticklabels(loc.get_xticklabels(), rotation=90, ha="right")
plt.ylabel("Frequency", size=15)
plt.xlabel("Location", size=18)
plt.title('Number of restaurants in a Location', size = 20, pad=20)
plt.show()
In [45]:
# Most famous restaurant chains in Bengaluru

plt.figure(figsize=(15,7))
chains = df_zomato['name'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set1')
plt.title("Most famous restaurant chains in Bangalore", size=20, pad=20)
plt.xlabel("Number of locals", size=15)
plt.show()
In [46]:
# Number of restaurants taking online order or not

plt.figure(figsize=(15,7))
df_zomato['online_order'].value_counts().plot.bar()
plt.title('Online orders', fontsize = 20)
plt.ylabel('Frequency', size = 15)
plt.show()
In [47]:
# Frequency of restaurants allowing booking table or not

plt.figure(figsize=(15,7))
df_zomato['book_table'].value_counts().plot.bar()
plt.title('Booking Table', fontsize = 20, pad=15)
plt.ylabel('Frequency', fontsize = 15)
plt.show()
In [49]:
# Restaurant rating distribution

plt.figure(figsize=(15,8))
rating = df_zomato['rate']
plt.hist(rating,bins=20, color="red")
plt.title('Restaurant rating distribution', size = 20, pad = 15)
plt.xlabel('Rating', size = 15)
plt.ylabel('No. of restaurants', size = 15)
plt.show()
In [50]:
# Approx cost for 2 people distribution

plt.figure(figsize=(15,8))
sns.violinplot(df_zomato.avg_cost_for_2_people)
plt.title('Approx cost for 2 people distribution', size = 20, pad = 15)
plt.xlabel('Approx cost for 2 people', size = 15)
plt.ylabel('Density', size = 15)
plt.show()

The approx cost for 2 people is around 300-400 INR.

In [51]:
# Most popular cuisines of Bangalore

plt.figure(figsize=(15,8))
cuisines = df_zomato['cuisines'].value_counts()[:15]
sns.barplot(cuisines,cuisines.index)
plt.title('Most popular cuisines of Bangalore', size = 20, pad = 15)
plt.xlabel('No. of restaurants', size = 15)
plt.show()

Ratings: NLP analysis¶

In [57]:
wordcloud = WordCloud(max_font_size=None, max_words=100, background_color="black", 
                      width=3000, height=2000, collocations=False,
                      stopwords = stopwords.words('english')).generate(str(df_ratings.review.values))

plot_wordcloud(wordcloud, 'English')

Sentiment analysis¶

In [58]:
rate_analyzer = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    score = rate_analyzer.polarity_scores(sentence)
    return score
In [62]:
ratings.head()
Out[62]:
name rating review neg neu pos compound
0 Jalsa 4.0 A beautiful place to dine inThe interiors take... 0.062 0.799 0.139 0.7430
1 Jalsa 4.0 I was here for dinner with my family on a week... 0.027 0.684 0.289 0.9623
2 Jalsa 2.0 Its a restaurant near to Banashankari BDA Me a... 0.047 0.781 0.172 0.7964
3 Jalsa 4.0 We went here on a weekend and one of us had th... 0.000 0.725 0.275 0.9678
4 Jalsa 5.0 The best thing about the place is its ambiance... 0.000 0.605 0.395 0.9702
In [63]:
# all scores in 4 histograms
fig, axes = plt.subplots(2, 2, figsize=(10,8))

# plot all 4 histograms
ratings.hist('neg', bins=25, ax=axes[0,0], color='lightcoral', alpha=0.6)
axes[0,0].set_title('Negative Sentiment Score')

ratings.hist('neu', bins=25, ax=axes[0,1], color='lightsteelblue', alpha=0.6)
axes[0,1].set_title('Positive Sentiment Score')

ratings.hist('pos', bins=25, ax=axes[1,0], color='chartreuse', alpha=0.6)
axes[1,0].set_title('Neutral Sentiment Score')

ratings.hist('compound', bins=25, ax=axes[1,1], color='navajowhite', alpha=0.6)
axes[1,1].set_title('Compound')

# plot common x- and y-label
fig.text(0.5, 0.04, 'Sentiment Scores',  fontweight='bold', ha='center')
fig.text(0.04, 0.5, 'Number of Reviews', fontweight='bold', va='center', rotation='vertical')

# plot title
plt.suptitle('Sentiment Analysis of Zomato reviews\n\n', fontsize=12, fontweight='bold');

Comparing Negative and Positive Comments¶

In [64]:
# full dataframe with POSITIVE comments
df_pos = ratings.loc[ratings.compound >= 0.95]

# only corpus of POSITIVE comments
pos_comments = df_pos['review'].tolist()
pos_comments[:3]
Out[64]:
['I was here for dinner with my family on a weekday The restaurant was completely empty Ambience is good with some good old hindi music Seating arrangement are good too We ordered masala papad panner and baby corn starters lemon and corrionder soup butter roti olive and chilli paratha Food was fresh and good service is good too Good for family hangout\nCheers',
 'We went here on a weekend and one of us had the buffet while two of us took Ala Carte Firstly the ambience and service of this place is great The buffet had a lot of items and the good was good We had a Pumpkin Halwa intm the dessert which was amazing Must try The kulchas are great here Cheers',
 'The best thing about the place is its ambiance Second best thing was yummy  food We try buffet and buffet food was not disappointed us\nTest      \nQuality  \nService Staff was very professional and friendly\n\nOverall experience was excellent\n\nsubirmajumder85wixsitecom']
In [65]:
# full dataframe with NEGATIVE comments
df_neg = ratings.loc[ratings.compound < 0.0]

# only corpus of NEGATIVE comments
neg_comments = df_neg['review'].tolist()
neg_comments[:3]
Out[65]:
['I had a very bad experience here\nI dont know about a la carte but the buffet was the worst They gave us complementary drink and momos before the buffet The momos were really good\nThe number of varieties first of all was very disappointing The service was very slow They refilled the food very slowly The starters were okay The main course also was so so There was two gravies with roti and some rice with raitha They had chats sev puri and pan puri which was average But the desert was disappointing They had gulab Jamun and chocolate cake The jamun was not cooked inside There was a cold blob of raw dough inside The chocolate cake also was really hard and not that good\nOverall the buffet was a bad experience for me',
 'Spice elephant soup SPL almost manchow flavour soup Just above medium spicy\n\nLasooni fish tikka was awesome\n\nI dont remember the dessert name but I have attached the photo  It had vanilla ice inside wafers Wafer was hell hard egg smell chewy  Nightmare dessert \n\nTable leg space was very bad I was so uncomfortable the whole time kept on adjusting my legs\n\nNo parking\n\nFor the taste felt this is too costly',
 'Ambience is not that good enough and its not a pocket friendly cafe and the quantity is not that good and desserts are too good enough ']
In [67]:
sns.set_style("whitegrid")
plt.figure(figsize=(8,5))

sns.distplot(df_pos['text_length'], kde=True, bins=50, color='chartreuse')
sns.distplot(df_neg['text_length'], kde=True, bins=50, color='lightcoral')

plt.title('\nDistribution Plot for Length of Comments\n')
plt.legend(['Positive Comments', 'Negative Comments'])
plt.xlabel('\nText Length')
plt.ylabel('Percentage of Comments\n');

It looks like the positive comments have more length than negative ones.

Topic modelling - Genism¶

In [84]:
for i in model_loaded.print_topics():
    print('Topic number {} \n'.format(i[0]))
    for j in i: 
        print (j)
Topic number 0 

0
0.029*"food" + 0.022*"order" + 0.015*"time" + 0.012*"delivery" + 0.011*"bad" + 0.010*"ordered" + 0.010*"dont" + 0.009*"restaurant" + 0.009*"even" + 0.008*"service"
Topic number 1 

1
0.045*"place" + 0.039*"good" + 0.038*"food" + 0.019*"service" + 0.015*"ambience" + 0.013*"great" + 0.010*"nice" + 0.010*"staff" + 0.009*"visit" + 0.008*"really"
Topic number 2 

2
0.019*"pizza" + 0.016*"cake" + 0.015*"place" + 0.010*"try" + 0.010*"chocolate" + 0.010*"good" + 0.010*"cheese" + 0.010*"one" + 0.009*"ordered" + 0.009*"cream"
Topic number 3 

3
0.049*"chicken" + 0.030*"good" + 0.021*"biryani" + 0.020*"ordered" + 0.019*"taste" + 0.019*"food" + 0.013*"rice" + 0.010*"paneer" + 0.010*"veg" + 0.009*"quantity"

Word Cloud visualizations of the topics¶

In [88]:
wordcloud = WordCloud(background_color="black", width=2500, height=1800,
                      stopwords = stopwords.words('english')).generate_from_frequencies(first_topic_words)

plot_wordcloud(wordcloud, '\nWord cloud of First Topic')
In [89]:
wordcloud = WordCloud(background_color="black", width=2500, height=1800,
                      stopwords = stopwords.words('english')).generate_from_frequencies(second_topic_words)

plot_wordcloud(wordcloud, '\nWord cloud of Second Topic')
In [90]:
wordcloud = WordCloud(background_color="black", width=2500, height=1800,
                      stopwords = stopwords.words('english')).generate_from_frequencies(third_topic_words)

plot_wordcloud(wordcloud, '\nWord cloud of Third Topic')
In [91]:
wordcloud = WordCloud(background_color="black", width=2500, height=1800,
                      stopwords = stopwords.words('english')).generate_from_frequencies(fourth_topic_words)

plot_wordcloud(wordcloud, '\nWord cloud of Fourth Topic')

Evaluation¶

Compute Model Perplexity and Coherence Score¶

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.

In [92]:
# Compute Perplexity
print('\nPerplexity: ', model_loaded.log_perplexity(doc_term_matrix))  # A measure of how good the model is. The lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model_loaded, texts=ratings_topic_model['text_cleaned'], 
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Perplexity:  -7.461145081504907

Coherence Score:  0.5021451296691057
In [93]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model_loaded, doc_term_matrix, dictionary)
vis
Out[93]:

Restaurant recommendator¶

In [100]:
def filter_by(df, cuisines, rate):
    df_outcome = df
    if cuisines:
        df_outcome = df_outcome[df_outcome['cuisines'].str.contains(cuisines)]
    if rate:
        df_outcome = df_outcome[df_outcome['rate'] >= rate]
    return df_outcome
In [101]:
"""
Returns a dataframe that recomends restaurants based on name, reviews, cuisine and rate.

Parameters
----------
name : str
       Restaurant name that you liked
       
cuisines : str, optional
           Type of cuisine you are looking for
           
rate: float, optional
      Minimum rate you want for the recomended restaurants

"""
def get_recommendations(name, cuisines = None, rate = None):
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number    
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'rate', 'avg_cost_for_2_people'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','rate', 'avg_cost_for_2_people']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','rate', 'avg_cost_for_2_people'], keep=False)
    df_new = df_new.sort_values(by='rate', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s : ' % (str(len(df_new)), name))
    
    return filter_by(df_new, cuisines, rate)
In [102]:
get_recommendations('Jalsa', 'North Indian', 4.5)
TOP 2 RESTAURANTS LIKE Jalsa : 
Out[102]:
cuisines rate avg_cost_for_2_people
In [103]:
get_recommendations('China Bowl', 'Chinese', 3)
TOP 4 RESTAURANTS LIKE China Bowl : 
Out[103]:
cuisines rate avg_cost_for_2_people
Yo! Chow Chinese, Momos 4.4 800.0
Green Onion Chinese 4.3 550.0
Green Onion Chinese 3.3 400.0
Chinese Street Chinese 3.1 650.0
In [104]:
get_recommendations('Dabba Karkhana')
TOP 8 RESTAURANTS LIKE Dabba Karkhana : 
Out[104]:
cuisines rate avg_cost_for_2_people
Hunger Meals South Indian, North Indian, Biryani 3.8 400.0
Agarwal Food Service North Indian, Chinese, Biryani 3.8 400.0
Cinnamon North Indian, Asian, Continental 3.7 1000.0
Swad Punjab Da North Indian 3.7 150.0
Melange - Hotel Ekaa North Indian, Chinese, Continental, Mangalorean 3.2 900.0
Mast Kalandar North Indian 2.5 450.0
Mast Kalandar North Indian 2.4 450.0
Mast Kalandar North Indian 2.3 450.0

Locations - map¶

In [110]:
import plotly.express as px

px.set_mapbox_access_token('pk.eyJ1IjoibnRzaGNob25nIiwiYSI6ImNrbWRxaHI2eDJvNTYydW53MmNjejg5emcifQ.TPLX9hTzCJUr41zLPxxxfQ')

fig = px.scatter_mapbox(location_coords, lat="latitude", lon="longitude",  color="rest_count", size="rest_count",
                        hover_name='short_name', color_continuous_scale=px.colors.cyclical.IceFire, size_max=60, zoom=12)
fig.show();
In [ ]: